Preparation

Library

reticulate::use_condaenv("/Users/pixushi/miniconda3/envs/qiime2-2021.2")
rm(list=ls())
# for data 
library(readr) # read tsv
library(qiime2R) # read in Qiime artifacts
library(dplyr) # data formatting
library(yaml) # for read_qza() in qiime2R
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
# for computing
library(reticulate) # run py codes
## Warning: package 'reticulate' was built under R version 4.3.3
library(phyloseq) # phyloseq object
## Warning: package 'phyloseq' was built under R version 4.3.1
library(vegan) # distance matrix
## Warning: package 'vegan' was built under R version 4.3.3
## Warning: package 'lattice' was built under R version 4.3.2
library(PERMANOVA) # permanova
## Warning: package 'deldir' was built under R version 4.3.2
library(randomForest) # random forest
library(PRROC) # roc and pr
library(tempted)
## Warning: package 'tempted' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
library(microTensor)
# for plotting
library(ggpubr)
library(ggplot2)
library(gridExtra)
library(RColorBrewer)
library(plotly)

# set working directory to be where the current script is located
setwd(dirname(rstudioapi::getSourceEditorContext()$path))

col_group <- c("#5A5A5A", brewer.pal(7,'Set1')[c(5,3)])
col_microbe <- c(brewer.pal(6,'Set1')[c(2,4)], brewer.pal(6,'Set2')[4])
npc <- 3 # number of components
bws <- 8 # for plotting

Read in the count data, meta data, distance matrices

meta_all <- read.csv("data/metadata_cleaned.csv", row.names=1)
meta_all$group <- factor(meta_all$group, levels=
                           c("genotype=WT, disease=no",
                             "genotype=Pax5+/-, disease=no",
                             "genotype=Pax5+/-, disease=pB-ALL"))
levels(meta_all$group) <- c("healthy wild type", "predisposed, remained healthy", "predisposed, developed pB-ALL")
count_all <- read.csv("data/count_cleaned.csv", row.names=1)
dim(count_all)
## [1]  327 3287
load("data/distance_mat.Rdata")
metauni <- unique(meta_all[,c('hostID', 'genotype', 'diseased', 'group')])
metauni$group <- factor(metauni$group, sort(levels(metauni$group)))
rownames(metauni) <- metauni$hostID
table(metauni$group)
## 
##             healthy wild type predisposed, developed pB-ALL 
##                            17                            17 
## predisposed, remained healthy 
##                            10

Plot timeline of study

meta_ordered <- meta_all[order(meta_all$group, meta_all$host_subject_id, decreasing=T),]
meta_ordered$host_subject_id <- factor(meta_ordered$host_subject_id, 
                                       levels=unique(meta_ordered$host_subject_id))
colnames(meta_ordered)[22] <- "Group"
p_timeline <- ggplot(data=meta_ordered, 
       aes(x=week, y=host_subject_id, 
           group=as.factor(host_subject_id), color=Group, shape=Group)) +
  geom_line() + geom_point() + scale_color_manual(values=col_group) +
  labs(y="Host ID", x="Week") + theme_bw() +
  theme(legend.position="bottom") + 
  scale_x_continuous(breaks=seq(from=0,to=90,by=5))
p_timeline

pdf('../figure_table/leukemia_timeline.pdf', width=7.7, height=5)
print(p_timeline)
dev.off()
## quartz_off_screen 
##                 2

TEMPTED

Run analysis

datlist_all <- format_tempted(count_all, meta_all$week, meta_all$hostID, 
                            threshold=0.95, pseudo=0.5, transform='clr')
print(dim(datlist_all[[1]]))
## [1] 1065   10
svd_all <- svd_centralize(datlist_all, 1)
res_tempted_all <- tempted(svd_all$datlist, r = npc, resolution = 51, smooth=1e-4)
## Calculate the 1th Component
## Convergence reached at dif=1.29075362408583e-05, iter=3
## Calculate the 2th Component
## Convergence reached at dif=7.67492932283745e-05, iter=9
## Calculate the 3th Component
## Convergence reached at dif=3.20041217209923e-05, iter=7
save(svd_all, res_tempted_all, datlist_all,
     file='result/res_leukemia_tempted.Rdata')
load('result/res_leukemia_tempted.Rdata')

res_tempted <- res_tempted_all
svd_tempted <- svd_all
meta_tab <- meta_all
count_tab <- count_all
datlist <- datlist_all

Plot TEMPTED time loading

p_time_tempted <- plot_time_loading(res_tempted, r=3) + scale_color_manual(values=col_microbe) + 
  labs(x='Week', y="Temporal Loading", color="Component") + 
  geom_line(linewidth=1.5) + theme_bw() + 
  theme(legend.position='bottom')

Plot TEMPTED subject loadings

A_hat <- metauni
rownames(A_hat) <- A_hat$hostID
table(rownames(A_hat)==rownames(res_tempted$A_hat))
## 
## TRUE 
##   44
A_hat <- cbind(res_tempted$A_hat[,1:3], A_hat)
A_hat$group <- factor(A_hat$group, 
  levels=c("healthy wild type", "predisposed, remained healthy", "predisposed, developed pB-ALL"))
p_sub_tempted <- plot_ly(A_hat, x=~PC1, y=~PC2, z=~PC3, 
             color=~group, colors=col_group)
p_sub_tempted <- p_sub_tempted %>% add_markers(marker=list(size=5))
p_sub_tempted <- p_sub_tempted %>% layout(scene=list(xaxis=list(title="Component 1", titlefont = list(size = 20)), 
                             yaxis=list(title="Component 2", titlefont = list(size = 20)), 
                             zaxis=list(title="Component 3", titlefont = list(size = 20))),
                          legend = list(font = list(size = 20), orientation = "h"))
p_sub_tempted
htmlwidgets::saveWidget(widget=p_sub_tempted, 
                        file="../figure_table/leukemia_sub_tempted.html",
                        selfcontained=F)
p_sub23_tempted <- ggplot(data=A_hat, aes(x=PC2, y=PC3)) +
  geom_point(aes(color=group)) + 
  scale_color_manual(values=col_group) + 
  theme_bw() +
  labs(x="Component 2", y="Component3") +
  theme(legend.position='bottom')

Plot TEMPTED trajectory of log ratio of top features

datlist_raw <- format_tempted(count_all, meta_all$week, meta_all$hostID, 
                            threshold=0.95, transform='none')
ratio_feat <- ratio_feature(res_tempted, datlist_raw, pct=0.005)

## summed up, by individual subject
tab_feat_ratio <- ratio_feat$metafeature_ratio
colnames(tab_feat_ratio)[2] <- 'hostID'
tab_feat_ratio <- merge(tab_feat_ratio, metauni)

## summed up, by mean and sd
reshape_feat_ratio <- reshape(tab_feat_ratio, 
                            idvar=c("hostID","timepoint") , 
                            v.names=c("value"), timevar="PC",
                            direction="wide")
CC <- grep("value", colnames(reshape_feat_ratio))
colnames(reshape_feat_ratio)[CC] <- paste("Component", 1:length(CC))
feature_mat_ratio <- reshape_feat_ratio[,c("Component 2", "Component 3")]

time_vec_ratio <- reshape_feat_ratio$timepoint
group_vec_ratio <- factor(reshape_feat_ratio$group, 
                        levels=c("healthy wild type", "predisposed, remained healthy", "predisposed, developed pB-ALL"))
p_feat_ratio_summary <- plot_feature_summary(feature_mat_ratio, 
                                           time_vec_ratio, 
                                           group_vec_ratio, bws=bws, nrow=1) + 
  xlab('Week') + theme_bw() + 
  theme(legend.position='bottom') +
  scale_color_manual(values=col_group) + scale_fill_manual(values=col_group)

Plot all results for main text

lay <- rbind(c(1,2,3,3), c(1,2,3,3), c(1,2,3,3), c(1,2,3,3), c(1,2,3,3), 
             c(4,5,5,5))
lgd1 <- get_legend(p_time_tempted)
lgd2 <- get_legend(p_sub23_tempted)
p_tempted_all <- grid.arrange(p_time_tempted+theme(legend.position="none"),
                          p_sub23_tempted+theme(legend.position="none"),
                          p_feat_ratio_summary+theme(legend.position="none"),
                          lgd1, lgd2,
                          layout_matrix=lay)

ggsave('../figure_table/leukemia_tempted.pdf', width=12, height=3.5, plot=p_tempted_all)

Plot top features associated with disease for supplementary

# the log ratio is constructed using the following features
ftnames <- rownames(datlist_raw[[1]])[-1]
tab_ftnames <- NULL
for (ii in 1:npc){
  tab_ftnames <- rbind(tab_ftnames, 
                       data.frame(sequence=ftnames[ratio_feat$toppct[,ii]],
                                  Component=ii, rank="Top"))
  tab_ftnames <- rbind(tab_ftnames, 
                       data.frame(sequence=ftnames[ratio_feat$bottompct[,ii]],
                                  Component=ii, rank="Bottom"))
}
taxtab <- read.csv("data/gg2-taxonomy.tsv", sep="\t", row.names=1)
tab_ftnames <- cbind(tab_ftnames, taxtab[tab_ftnames$sequence,])
tab_taxsplit <- t(matrix(unlist(lapply(tab_ftnames$Taxon, function(x){strsplit(x, "; ")[[1]]})), nrow=7))
colnames(tab_taxsplit) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
tab_ftnames <- cbind(tab_ftnames, tab_taxsplit)
shortname <- tab_taxsplit[, "Species"]
for (j in colnames(tab_taxsplit)[6:1]){
  shortname[nchar(shortname)<=3] <- tab_taxsplit[nchar(shortname)<=3, j]
}
shortname <- tab_taxsplit[, "Species"]
for (j in colnames(tab_taxsplit)[6:1]){
  shortname[nchar(shortname)<=3] <- tab_taxsplit[nchar(shortname)<=3, j]
}
for (ii in 1:(length(shortname)-1)){
  ind <- grep(shortname[ii], shortname)
  if (length(ind)>1) {
    shortname[ind[-1]] <- paste0(shortname[ind[-1]], "_", 1+1:length(ind[-1]))
  }
}
tab_ftnames$shortname <- shortname
write.csv(tab_ftnames, file="../figure_table/leukemia_topfeature.csv")
for (ii in 2:3){
  tab_ftnames_pc <- dplyr::filter(tab_ftnames, Component==ii)
  topfeat_pc <- tab_ftnames_pc$sequence
  
  prop_all <- (count_all)/rowSums(count_all)
  if (ii==2){
    feat_mat_pc <- prop_all[meta_all$genotype!="WT",topfeat_pc]
    meta_pc <- meta_all[meta_all$genotype!="WT",]
  }
  if (ii==3){
    feat_mat_pc <- prop_all[,topfeat_pc]
    meta_pc <- meta_all
  } 
  nfeat <- ncol(feat_mat_pc)

  tab_topfeat <- data.frame(RA=as.vector(as.matrix(feat_mat_pc)),
                            feature=rep(tab_ftnames_pc$shortname, each=nrow(feat_mat_pc)),
                            time=rep(meta_pc$week, nfeat),
                            group=rep(meta_pc[,ifelse(ii==2, "group", "genotype")], nfeat),
                            hostID=rep(meta_pc$hostID, nfeat))
  p_topfeat_tempted <- ggplot(data=tab_topfeat) + 
    geom_point(aes(x=time, y=RA, color=group), size=0.8) + 
    facet_wrap(vars(feature), nrow=3) + 
    theme_bw() + 
    labs(x="Week", y="Relative Abundance", color=ifelse(ii==2, "Outcome", "Genotype")) + 
    ggtitle(paste0("Top Features Identified by Component ", ii)) + theme(legend.position="bottom") + 
     scale_y_sqrt()
  if (ii==2){
    p_topfeat_tempted <- p_topfeat_tempted + geom_vline(xintercept = 35, linetype="dashed", 
                color = "grey", linewidth=1)
  }
  print(p_topfeat_tempted)
  pdf(paste0("../figure_table/leukemia_topfeat", ii, ".pdf"), height=5, width=10)
  print(p_topfeat_tempted)
  dev.off()
}

CTF

Run analysis

py_run_file(file="analysis_leukemia_ctf.py",convert=F)

CTF subject loadings

ctf_sub <- read_qza("result/subject-biplot_leukemia.qza")$data$Vectors
ctf_sub$group <- metauni[ctf_sub$SampleID, "group"]
p_sub_ctf <- plot_ly(ctf_sub, x=~`PC1`, y=~`PC2`, z=~`PC3`, 
                 color=~group, colors=col_group)
p_sub_ctf <- p_sub_ctf %>% add_markers(marker=list(size=5))
p_sub_ctf <- p_sub_ctf %>% layout(scene=list(xaxis=list(title="Component 1", titlefont = list(size = 20)), 
                                     yaxis=list(title="Component 2", titlefont = list(size = 20)), 
                                     zaxis=list(title="Component 3", titlefont = list(size = 20))),
                          legend = list(font = list(size = 20), orientation = "h"))
p_sub_ctf
htmlwidgets::saveWidget(widget=p_sub_ctf, 
                        file="../figure_table/leukemia_sub_ctf.html",
                        selfcontained=F)
p_sub12_ctf <- ctf_sub %>% ggplot() + 
  geom_point(aes(x=PC1, y=PC2, color=group)) +
  scale_color_manual(values=col_group) +
  theme_bw() +
  theme(legend.position="bottom")
p_sub13_ctf <- ctf_sub %>% ggplot() + 
  geom_point(aes(x=PC1, y=PC3, color=group)) +
  scale_color_manual(values=col_group) + 
  theme_bw() + 
  theme(legend.position='bottom')
p_sub23_ctf <- ctf_sub %>% ggplot() + 
  geom_point(aes(x=PC2, y=PC3, color=group)) +
  scale_color_manual(values=col_group) +
  theme_bw() + 
  theme(legend.position='bottom')
g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)}

CTF trajectories

# function to read in trajectories
read_traj <- function(file){
  tmp <- tempdir()
  rm <- TRUE
  unzip(file, exdir=tmp) 
  unpacked<-unzip(file, exdir=tmp, list=TRUE)
  artifact<-read_yaml(paste0(tmp,"/", paste0(gsub("/..+","", unpacked$Name[1]),"/metadata.yaml")))
  artifact$contents<-data.frame(files=unpacked)
  artifact$contents$size=sapply(paste0(tmp, "/", artifact$contents$files), file.size)
  artifact$version=read.table(paste0(tmp,"/",artifact$uuid, "/VERSION"))
  artifact$data <- read_tsv(paste0(tmp,"/",artifact$uuid,"/data/trajectory.tsv"))
  return(artifact)
}
ctf_traj <- as.data.frame(read_traj("result/state-subject-ordination_leukemia.qza")$data)
## Rows: 327 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): #SampleID, subject_id, group
## dbl (5): PC1, PC2, PC3, week_int, week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ctf_traj$group <- meta_all[ctf_traj$`#SampleID`, "group"]
colnames(ctf_traj)[2:4] <- paste("Component", 1:3)
p_traj_ctf <- plot_feature_summary(ctf_traj[,c("Component 1","Component 2","Component 3")], 
                     ctf_traj$week_int, 
                     ctf_traj$group,
                     bws=bws) + 
  xlab('Week') + theme_bw() + 
  theme(legend.position='bottom') +
  scale_color_manual(values=col_group) + 
  scale_fill_manual(values=col_group)

CTF time loadings

ctf_time <- read_qza("result/state-biplot_leukemia.qza")$data$Vectors
colnames(ctf_time)[1] <- "week_int"
tab_time <- data.frame(time=ctf_time$week_int, value=as.vector(as.matrix(ctf_time[,2:4])), 
                       component=as.factor(as.vector(t(matrix(rep(1:3,nrow(ctf_time)),3,)))))
p_time_ctf <- tab_time %>% ggplot(aes(x=time, y=value, color=component)) + 
  geom_line(linewidth=1.5) + 
  scale_color_manual(values=col_microbe) + 
  labs(x='Week', y="Temporal Loading", color="Component") + 
  theme_bw() + 
  theme(legend.position='bottom')

Plot all results

lay <- rbind(c(1,2,3),c(1,2,3),c(1,2,3),c(1,2,3),c(1,2,3),
             c(4,4,4),c(4,4,4),c(4,4,4),c(4,4,4),c(4,4,4),
             c(5,5,5),
             c(6,6,6))
lgd1 <- get_legend(p_time_ctf)
lgd2 <- get_legend(p_sub23_ctf)
p_ctf_all <- grid.arrange(
  p_time_ctf+theme(legend.position="none"),
  p_sub12_ctf+theme(legend.position="none"),
  p_sub23_ctf+theme(legend.position="none"),
  p_traj_ctf+theme(legend.position="none"),
  lgd1, lgd2, 
  layout_matrix=lay)

ggsave("../figure_table/leukemia_ctf.pdf", width=8, height=5.5, plot=p_ctf_all)

microTensor

Run analysis

count_tab <- count_all[,colMeans(count_all==0)<=0.95]
meta_tab <- meta_all[,c("sample_name", "hostID", "week", "group", "genotype", "diseased")]
meta_tab$time_disc <- round(meta_tab$week)
tm <- sort(unique(meta_tab$time_disc))
metauni <- unique(meta_tab[,c("hostID", "group", "genotype", "diseased")])
rownames(metauni) <- metauni$hostID
      
X_array <- array(NA, dim = c(ncol(count_tab),
                         length(unique(meta_tab$hostID)),
                         length(tm)))
dimnames(X_array) <- list(colnames(count_tab),
                          unique(meta_tab$hostID),
                          tm)
for(k in 1:length(tm)) {
  k_df_samples <- meta_tab %>% 
    dplyr::filter(time_disc == tm[k])
  k_df_samples <- k_df_samples[!duplicated(k_df_samples$hostID),]
  X_array[, k_df_samples$hostID, k] <- 
    t(count_tab[rownames(k_df_samples), ])
}
mean(is.na(X_array)) # check proportion of NAs

set.seed(1)
fit_microTensor <- 
  microTensor::microTensor(X = X_array, R = npc, 
                           nn_t = TRUE, ortho_m = TRUE,
                           weighted = TRUE)
micro_sub <- as.data.frame(fit_microTensor$s)
colnames(micro_sub) <- paste0("PC",1:npc)
micro_sub$hostID <- dimnames(X_array)[[2]]
micro_sub <- merge(metauni, micro_sub)
write.csv(micro_sub, file="result/subject_microtensor_leukemia.csv")
micro_time <- as.data.frame(fit_microTensor$t)
colnames(micro_time) <- paste0("PC",1:3)
micro_time$time <- tm
write.csv(micro_time, file="result/time_microtensor_leukemia.csv")

micro_traj <- microTensor::create_loading(fit_decomp = fit_microTensor,
                                          feature_names = dimnames(X_array)[[1]],
                                          subject_names = dimnames(X_array)[[2]],
                                          time_names = tm,
                                          class = "sample")
colnames(micro_traj) <- c("hostID", "time_disc", paste0("PC",1:npc))
micro_traj <- merge(meta_tab[,c("sample_name", "hostID", "time_disc", "group", "genotype", "diseased")], micro_traj)
write.csv(micro_traj, file="result/trajectory_microtensor_leukemia.csv")

microTensor subject loading

micro_sub <- read.csv(file="result/subject_microtensor_leukemia.csv", row.names=1, header=T)
p_sub_micro <- plot_ly(micro_sub, x=~PC1, y=~PC2, z=~PC3, 
             color=~group, colors=col_group)
p_sub_micro <- p_sub_micro %>% add_markers(marker=list(size=5))
p_sub_micro <- p_sub_micro %>% layout(scene=list(xaxis=list(title="Component 1", titlefont = list(size = 20)), 
                             yaxis=list(title="Component 2", titlefont = list(size = 20)), 
                             zaxis=list(title="Component 3", titlefont = list(size = 20))),
                          legend = list(font = list(size = 20), orientation = "h"))
p_sub_micro
htmlwidgets::saveWidget(widget=p_sub_micro, 
                        file="../figure_table/leukemia_sub_microtensor.html",
                        selfcontained=F)

p_sub12_micro <- micro_sub %>% ggplot() + 
  geom_point(aes(x=PC1, y=PC2, color=group)) +
  scale_color_manual(values=col_group) +
  theme_bw() +
  theme(legend.position="bottom")
p_sub13_micro <- micro_sub %>% ggplot() + 
  geom_point(aes(x=PC1, y=PC3, color=group)) +
  scale_color_manual(values=col_group) + 
  theme_bw() + 
  theme(legend.position='bottom')
p_sub23_micro <- micro_sub %>% ggplot() + 
  geom_point(aes(x=PC2, y=PC3, color=group)) +
  scale_color_manual(values=col_group) +
  theme_bw() + 
  theme(legend.position='bottom')
g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)}

microTensor trajectories

micro_traj <- read.csv(file="result/trajectory_microtensor_leukemia.csv", header=T, row.names=1)
colnames(micro_traj)[7:9] <- paste("Component", 1:3)
p_traj_micro <- plot_feature_summary(micro_traj[,c("Component 1","Component 2","Component 3")], 
                     micro_traj$time_disc, 
                     micro_traj$group,
                     bws=bws) + 
  xlab('Week') + theme_bw() + 
  theme(legend.position='bottom') +
  scale_color_manual(values=col_group) + 
  scale_fill_manual(values=col_group)

microTensor time loading

micro_time <- read.csv("result/time_microtensor_leukemia.csv", header=T)
tab_micro_time <- tidyr::pivot_longer(micro_time, 
                        cols = c(PC1, PC2, PC3), 
                        names_to = "PC", 
                        values_to = "value")
p_time_micro <- ggplot(data=tab_micro_time) + 
  geom_line(linewidth=1.5, aes(x=time, y=value, color=PC)) + 
  scale_color_manual(values=col_microbe) + 
  labs(x="Study Day", y="Time Loading", color="Component") +
  theme_bw() + 
  theme(legend.position='bottom')

Plot all results

lay <- rbind(c(1,2,3),c(1,2,3),c(1,2,3),c(1,2,3),c(1,2,3),
             c(4,4,4),c(4,4,4),c(4,4,4),c(4,4,4),c(4,4,4),
             c(5,5,5),
             c(6,6,6))
lgd1 <- get_legend(p_time_micro)
lgd2 <- get_legend(p_sub23_micro)
p_micro_all <- grid.arrange(
  p_time_micro+theme(legend.position="none"),
  p_sub12_micro+theme(legend.position="none"),
  p_sub23_micro+theme(legend.position="none"),
  p_traj_micro+theme(legend.position="none"),
  lgd1, lgd2, 
  layout_matrix=lay)

ggsave("../figure_table/leukemia_microtensor.pdf", width=8, height=5.5, plot=p_micro_all)

PCoA

# get mds results
mds_mat_all <- NULL
for (i in 1:length(dm_list)){
  dist_tab <- dm_list[[i]]
  mds_all <- cmdscale(dist_tab, k=3)
  mds_mat <- data.frame(hostID=meta_all[rownames(mds_all),'hostID'],
                        PC=mds_all)
  mds_mat$timepoint <- meta_all[rownames(mds_all),'week']
  mds_mat <- merge(mds_mat, metauni, by='hostID')
  mds_mat$method <- names(dm_list)[i]
  mds_mat_all <- rbind(mds_mat_all, mds_mat)
}
mds_mat_all$group <- factor(mds_mat_all$group, 
                            levels=c("healthy wild type", "predisposed, remained healthy", "predisposed, developed pB-ALL"))
colnames(mds_mat_all)
## [1] "hostID"    "PC.1"      "PC.2"      "PC.3"      "timepoint" "genotype" 
## [7] "diseased"  "group"     "method"
mds_mat_all$method <- as.factor(mds_mat_all$method)
levels(mds_mat_all$method)
## [1] "bray"     "jaccard"  "unifrac"  "wunifrac"
levels(mds_mat_all$method) <- c("Bray-Curtis", "Jaccard", "UniFrac", "Weighted UniFrac")
names(dm_list)
## [1] "unifrac"  "wunifrac" "bray"     "jaccard"
names(dm_list) <- c("UniFrac", "Weighted UniFrac", "Bray-Curtis", "Jaccard")

Week 35-70 PERMANOVA for only Pax mice wrt disease

# other methods
mds_mat_Pax <- dplyr::filter(mds_mat_all, genotype!='WT')
# result from tempted
ratio_mat_Pax <- dplyr::filter(reshape_feat_ratio, genotype!='WT')

mth <- levels(mds_mat_all$method)
pval_permanova_late <- rep(NA, length(mth)+2)
names(pval_permanova_late) <- c(mth, "TEMPTED", "CTF")
Fval_permanova_late <- pval_permanova_late
for (j in 1:length(mth)){
  tmp <- dplyr::filter(mds_mat_Pax, 
                       timepoint>=35 & timepoint<=70 & method==mth[j])
  res_adonis <- adonis2(tmp[,2:4] ~ diseased, method='euclidean',
                       data=tmp)
  pval_permanova_late[j] <- res_adonis$`Pr(>F)`[1]
  Fval_permanova_late[j] <- res_adonis$F[1]
}
# for TEMPTED
tmp <- dplyr::filter(ratio_mat_Pax, timepoint>=35 & timepoint<=70)
res_adonis <- adonis2(tmp[,6:8] ~ diseased, method='euclidean',
                     data=tmp)
pval_permanova_late[length(mth)+1] <- res_adonis$`Pr(>F)`[1]
Fval_permanova_late[length(mth)+1] <- res_adonis$F[1]
# for CTF
tmp <- dplyr::filter(ctf_traj, week_int>=35 & week_int<=70 & group!="healthy wild type")
res_adonis <- adonis2(tmp[,2:4] ~ tmp$group, method='euclidean')
pval_permanova_late[length(mth)+2] <- res_adonis$`Pr(>F)`[1]
Fval_permanova_late[length(mth)+2] <- res_adonis$F[1]

cbind(pval_permanova_late,
      Fval_permanova_late)
##                  pval_permanova_late Fval_permanova_late
## Bray-Curtis                    0.133           1.9960173
## Jaccard                        0.157           1.9300087
## UniFrac                        0.334           0.9555013
## Weighted UniFrac               0.809           0.1450785
## TEMPTED                        0.002           6.9220221
## CTF                            0.026           3.7071993
write.csv(cbind(pval_permanova_late,
                Fval_permanova_late), file="../figure_table/leukemia_permanova.csv")

Trajectory plots for other methods and Wilcoxon rank test of Week 35-70 samples

group_level <- levels(mds_mat_all$group)
nmethod <- length(dm_list)
CI_length <- -qnorm((1-0.95)/2)
tab_summary_all <- NULL
pval_wilcox_late <- Wval_wilcox_late <- matrix(NA, nmethod+2, npc)
rownames(pval_wilcox_late) <- rownames(Wval_wilcox_late) <- c(names(dm_list), "TEMPTED", "CTF")
for (kk in 1:nmethod){
  mthd <- names(dm_list)[kk]
  time_all <- NULL
  mean_all <- NULL
  merr_all <- NULL
  feature_all <- NULL
  group_all <- NULL
  ind_mthd <- mds_mat_all$method==mthd
  feature_mat <- mds_mat_all[ind_mthd,c(1+1:npc)]
  time_vec <- mds_mat_all$timepoint[ind_mthd]
  group_vec <- mds_mat_all$group[ind_mthd]
  
  # Wilcoxon test
  for (jj in 1:ncol(feature_mat)){
    sel <- time_vec>=35 & time_vec<=70 & group_vec!='healthy wild type'
    table(group_vec[sel])
    test_tmp <- wilcox.test(feature_mat[sel,jj]~group_vec[sel])
    pval_wilcox_late[kk,jj] <- test_tmp$p.value
    Wval_wilcox_late[kk,jj] <- test_tmp$statistic
  }
  for (jj in 1:ncol(feature_mat)){
    for (ii in 1:length(group_level)){
      ind <- group_vec==group_level[ii]
      model.np <- npreg(feature_mat[ind,jj]~time_vec[ind], bws=bws,
                          regtyle="ll", bwmethod="cv.aic", gradients=T)
      time_eval <- as.vector(t(model.np$eval))
      mean_eval <- model.np$mean[order(time_eval)]
      merr_eval <- model.np$merr[order(time_eval)]
      time_eval <- sort(time_eval)
      
      time_all <- c(time_all, time_eval)
      mean_all <- c(mean_all, mean_eval)
      merr_all <- c(merr_all, merr_eval)
      feature_all <- c(feature_all, 
                       rep(colnames(feature_mat)[jj], length(time_eval)))
      group_all <- c(group_all,
                     rep(group_level[ii], length(time_eval)))
    }
  }
  group_all <- factor(group_all, levels=group_level)
  tab_summary <- data.frame(time=time_all, mean=mean_all, merr=merr_all,
                            group=group_all, feature=feature_all)
  tab_summary$method <- names(dm_list)[kk]
  tab_summary_all <- rbind(tab_summary_all, tab_summary)
}
colnames(pval_wilcox_late) <- colnames(Wval_wilcox_late) <- colnames(feature_mat)
pval_wilcox_late <- as.data.frame(pval_wilcox_late)
Wval_wilcox_late <- as.data.frame(Wval_wilcox_late)

# for TEMPTED
sel <- time_vec_ratio>=35 & time_vec_ratio<=70 & 
  group_vec_ratio!='healthy wild type'
wilcox_tempted <- wilcox.test(feature_mat_ratio$`Component 2`[sel]~
                             group_vec_ratio[sel])
pval_wilcox_late["TEMPTED","PC.2"] <- wilcox_tempted$p.value
Wval_wilcox_late["TEMPTED","PC.2"] <- wilcox_tempted$statistic
# for CTF
ctf_traj_late <- dplyr::filter(ctf_traj, week_int>=35 & week_int<=70 & 
  group!='healthy wild type')
wilcox_ctf <-  wilcox.test(ctf_traj_late$`Component 1`~ctf_traj_late$group)
pval_wilcox_late["CTF","PC.1"] <- wilcox_ctf$p.value
Wval_wilcox_late["CTF","PC.1"] <- wilcox_ctf$statistic

tab_summary_all$method <- factor(tab_summary_all$method, 
                                 levels=unique(tab_summary_all$method))
tab_summary_all$feature <- factor(tab_summary_all$feature, 
                                  levels=unique(tab_summary_all$feature))
levels(tab_summary_all$feature) <- paste0("Component ", 1:3)
p_summary <- ggplot(data=tab_summary_all, 
                    aes(x=time, y=mean, group=group, color=group)) + 
  geom_line() + theme_bw()+ 
  geom_ribbon(aes(ymin=mean-CI_length*merr, ymax=mean+CI_length*merr, 
                  color=group, fill=group), linetype=2, alpha=0.3) + 
  ylab(paste0('mean +/- ', round(CI_length,2), '*se')) + xlab("Week") +
  facet_grid(feature~method) + 
  theme(legend.position="bottom") + 
  scale_color_manual(values=col_group) + 
  scale_fill_manual(values=col_group) + 
  scale_x_continuous(breaks=c(0,25,50,75,100))
p_summary

pdf("../figure_table/leukemia_mds.pdf", width=8, height=5)
print(p_summary)
dev.off()
## quartz_off_screen 
##                 2
# Wilcoxon rank test
pval_wilcox_late
##                         PC.1         PC.2      PC.3
## UniFrac          0.135431301 0.1310596149 0.8488535
## Weighted UniFrac 0.672460669 0.6130066586 0.7591171
## Bray-Curtis      0.006876426 0.6130066586 0.4910435
## Jaccard          0.005848879 0.5229909514 0.4303680
## TEMPTED                   NA 0.0000156904        NA
## CTF              0.029485574           NA        NA
colnames(pval_wilcox_late) <- paste("Component", 1:3)
write.csv(pval_wilcox_late, '../figure_table/leukemia_wilcox_pcoa.csv')